In [1]:
% matplotlib inline
import pandas as pd
from dateutil.relativedelta import relativedelta
import statsmodels.formula.api as sm
import requests
import pickle
from user_object import User
Our measures of user activity over a time span include:
Our measures of harassment received/made over a time span are:
the number of comments received that scored above a given threshold for any of our 3 harassment classifiers (aggression, attack, toxicity)
the number of comments made that scored above a given threshold for any of our 3 harassment classifiers

We also gather:
As mentioned above, we gather activity and harassment features for newcomers in timespan t1 and see how they correlate with activity features in timespan t2.
In the following analysis, the two time spans we are interested in are the first and second month after user registration.
In [2]:
def select_month_since_start(user, activity, t):
    """Restrict `activity` to rows falling in the t-th month after the user's first edit.

    The window is [first_edit_day + (t-1) months, first_edit_day + t months),
    i.e. inclusive at the start and exclusive at the end.
    """
    window_start = user.first_edit_day + relativedelta(months=(t - 1))
    window_end = user.first_edit_day + relativedelta(months=t)
    in_window = (activity['timestamp'] >= window_start) & (activity['timestamp'] < window_end)
    return activity[in_window]
def count_edits(user, t):
    """Total revisions the user made in month t after their first edit (0 if no activity data)."""
    if user.df_activity is None:
        return 0
    monthly = select_month_since_start(user, user.df_activity, t)
    return monthly['n_revisions'].sum()
def count_ns0_revisions(user, t):
    """Revisions the user made in namespace 0 during month t (0 if no activity data)."""
    if user.df_activity is None:
        return 0
    monthly = select_month_since_start(user, user.df_activity, t)
    # ns is stored as a string column, hence the quoted comparison
    return monthly.query("ns=='0'")['n_revisions'].sum()
def count_days_active(user, t):
    """Number of distinct activity timestamps (days) for the user in month t."""
    if user.df_activity is None:
        return 0
    monthly = select_month_since_start(user, user.df_activity, t)
    return len(monthly.timestamp.unique())
def count_score_received_above_threshold(user, score, threshold, t):
    """Count comments received by the user in month t whose `score` column exceeds `threshold`."""
    comments = user.df_comments_to
    if comments is None:
        return 0
    monthly = select_month_since_start(user, comments, t)
    return (monthly[score] > threshold).sum()
def count_score_made_above_threshold(user, score, threshold, t):
    """Count comments made by the user in month t whose `score` column exceeds `threshold`."""
    comments = user.df_comments_from
    if comments is None:
        return 0
    monthly = select_month_since_start(user, comments, t)
    return (monthly[score] > threshold).sum()
def is_female(u):
    """1 if the user's self-reported gender is 'female', else 0."""
    return 1 if u.gender == 'female' else 0
def is_male(u):
    """1 if the user's self-reported gender is 'male', else 0."""
    return 1 if u.gender == 'male' else 0
def count_warnings_received(user, t):
    """Number of user-warning entries the user received in month t (0 if no warning data)."""
    if user.df_uw is None:
        return 0
    monthly = select_month_since_start(user, user.df_uw, t)
    return len(monthly)
def count_fraction_of_ns0_revisions_x(user, x, t):
    """Fraction of the user's namespace-0 revisions in month t counted by column `x`
    (e.g. 'n_deleted_revisions', 'n_identity_reverted_revisions', 'n_productive_revisions').

    Returns 0 when there is no activity data or no ns-0 revisions in the window.
    """
    if user.df_activity is None:
        return 0
    ns0_activity = user.df_activity.query("ns=='0'")
    monthly = select_month_since_start(user, ns0_activity, t)
    total_revisions = monthly['n_revisions'].sum()
    if total_revisions < 1:
        return 0
    return float(monthly[x].sum()) / total_revisions
In [3]:
# Map of feature name -> extractor function over a User object.
# m1_*/m2_* prefixes denote the first/second month after registration.
feature_map = {
    'first_edit_day' : lambda u: u.first_edit_day,
    'm1_num_ns0_edits' : lambda u: count_ns0_revisions(u, 1),
    'user_id' : lambda u : u.user_id,
    'is_female' : is_female,
    'is_male' : is_male,
    'has_gender' : lambda u: int(is_female(u) or is_male(u)),
    'm1_num_edits' : lambda u: count_edits(u, 1) ,
    'm2_num_edits' : lambda u: count_edits(u, 2),
    'm1_num_days_active' : lambda u: count_days_active(u, 1),
    'm2_num_days_active' : lambda u: count_days_active(u, 2),
    'm1_num_warnings_recieved' : lambda u: count_warnings_received(u, 1),
    'm1_fraction_ns0_deleted' : lambda u: count_fraction_of_ns0_revisions_x(u, 'n_deleted_revisions', 1) ,
    'm1_fraction_ns0_reverted' : lambda u: count_fraction_of_ns0_revisions_x(u, 'n_identity_reverted_revisions', 1) ,
    'm1_fraction_ns0_productive' : lambda u: count_fraction_of_ns0_revisions_x(u, 'n_productive_revisions', 1) ,
    'm1_active' : lambda u: int(count_edits(u, 1) > 0),
    'm2_active' : lambda u: int(count_edits(u, 2) > 0),
}

# The harassment features differ only by classifier and threshold, so build
# them in a loop instead of 24 copy-pasted lines.  The default arguments
# (score=..., thresh=...) bind the loop variables at definition time; a plain
# closure would late-bind and leave every lambda using the final loop values.
for clf in ('aggression', 'attack', 'toxicity'):
    for threshold in (0.01, 0.425, 0.75, 0.85):
        score_column = 'pred_%s_score' % clf
        feature_map['m1_num_%s_received_%.3f' % (clf, threshold)] = (
            lambda u, score=score_column, thresh=threshold:
                count_score_received_above_threshold(u, score, thresh, 1))
        feature_map['m1_num_%s_made_%.3f' % (clf, threshold)] = (
            lambda u, score=score_column, thresh=threshold:
                count_score_made_above_threshold(u, score, thresh, 1))
In [4]:
# NOTE(review): pickle.load executes arbitrary code on load — only use on
# files this pipeline produced itself.
# Context manager closes the file handle deterministically; the original
# bare open() left it to the garbage collector.
with open("../../data/retention/random_user_data.pkl", "rb") as f:
    random_user_objects = pickle.load(f)
In [5]:
# Evaluate every feature for each sampled user and assemble them into a frame,
# keyed by user_id.
feature_columns = {name: [extract(u) for u in random_user_objects]
                   for name, extract in feature_map.items()}
df_features = pd.DataFrame(feature_columns).set_index('user_id')
print(df_features.shape)
# Keep only newcomers who actually edited in their first month.
df_active = df_features.query('m1_active == 1')
print(df_active.shape[0])
df_active.to_csv("../../data/retention/random_user_sample_features.csv")
In [6]:
# NOTE(review): pickle.load executes arbitrary code on load — only use on
# files this pipeline produced itself.
# Context manager closes the file handle deterministically; the original
# bare open() left it to the garbage collector.
with open("../../data/retention/attacked_user_data.pkl", "rb") as f:
    attacked_user_objects = pickle.load(f)
In [7]:
# Same feature extraction as the random sample, applied to the attacked cohort.
feature_columns = {name: [extract(u) for u in attacked_user_objects]
                   for name, extract in feature_map.items()}
df_features = pd.DataFrame(feature_columns).set_index('user_id')
print(df_features.shape)
# Keep only newcomers who actually edited in their first month.
df_active = df_features.query('m1_active == 1')
print(df_active.shape[0])
df_active.to_csv("../../data/retention/attacked_user_sample_features.csv")